source("../Rscripts/BaseScripts.R")
require(data.table)
require(plyr)
require(RColorBrewer)```bash
/home/jamcgirr/apps/angsd/misc/thetaStat print /home/ktist/ph/data/angsd/theta/PWS91_maf00.thetas.idx | gzip > /home/ktist/ph/data/angsd/theta/PWS91_maf00_thetas.pestPG.gz
# Run printTheta.sh at farm
<!-- rnb-source-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
## Run R scripts at farm to create null distribution of genome-wide thetas
Run angsd_theta_siteshuffle_null.sh at farm, which runs Pi_shuffle_pws.R
- Takes long time, so create a script for each pop.year combination and run separately
Output from theta shuffling results are in Data/shuffle/theta.siteshuffle.50000.PWS91_PWS96.csv.gz
## Calculate p-values for differences in theta, pi, and Tajima's D using the results from shuffling
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjpbIiNmcm9tIGNvZEV2b2wgYW5nc2RfdGhldGFfc2l0ZXNodWZmbGVfbnVsbF9zdGF0cy5SIiwiIiwiIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIiwiIyBsb2FkIGZ1bmN0aW9ucyIsIiMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyIsInJlcXVpcmUoZGF0YS50YWJsZSkiLCJyZXF1aXJlKHBseXIpIiwicmVxdWlyZShnZ3Bsb3QyKSIsInJlcXVpcmUoUkNvbG9yQnJld2VyKSIsIiIsIiIsIiMjIyMjIyMjIyMjIyMjIyMjIyMjIyIsIiMgcmVhZCBpbiBhbmQgcHJlcCBkYXRhIiwiIyMjIyMjIyMjIyMjIyMjIyMjIyMjIiwiIiwieWVhcnM8LWMoXCJQV1M5MVwiLFwiUFdTOTZcIixcIlBXUzA3XCIsXCJQV1MxN1wiKSIsImNvbWI8LXQoY29tYm4oeWVhcnMsIDIpKSIsIiIsImNocm1heCA8LSBmcmVhZCgnLi4vRGF0YS9uZXdfdmNmL2Nocl9zaXplcy5iZWQnKSIsImNocm1heDwtY2hybWF4WywtMl0iLCJjb2xuYW1lcyhjaHJtYXgpPC1jKFwiY2hyXCIsIFwibGVuXCIpIiwiY2hybWF4JHN0YXJ0PC1jKDAsY3Vtc3VtKGNocm1heCRsZW4pWzE6KG5yb3coY2hybWF4KS0xKV0pIiwiIiwiY2hybWF4JGVuZDwtY3Vtc3VtKGNocm1heCRsZW4pIiwiY2hybWF4JG1pZGRsZTwtKGNocm1heCRlbmQtY2hybWF4JHN0YXJ0KS8yK2Nocm1heCRzdGFydCIsIiIsIiNzZXRrZXkoY2hybWF4LCBjaHIpIiwiIiwiI0Z1bmN0aW9ucyB0byBjYWxjdWxhdGUgcC12YWx1ZXMgZnJvbSBjb2RFdm9sIiwiY2FsY3BHIDwtIGZ1bmN0aW9uKHRoZXRhY2hhbmdlLCBudWxsKXsgIyBmb3IgaW5jcmVhc2VzIGluIHRoZXRhIiwiICByZXR1cm4oKHN1bShudWxsID4gdGhldGFjaGFuZ2UpKzEpLyhsZW5ndGgobnVsbCkrMSkpICMgZXF1YXRpb24gZnJvbSBOb3J0aCBldCBhbC4gMjAwMiBBbSBKIEh1bSBHZW4iLCJ9IiwiY2FsY3BMIDwtIGZ1bmN0aW9uKHRoZXRhY2hhbmdlLCBudWxsKXsgIyBmb3IgZGVjcmVhc2VzIGluIHRoZXRhIiwiICByZXR1cm4oKHN1bShudWxsIDwgdGhldGFjaGFuZ2UpKzEpLyhsZW5ndGgobnVsbCkrMSkpICMgZXF1YXRpb24gZnJvbSBOb3J0aCBldCBhbC4gMjAwMiBBbSBKIEh1bSBHZW4iLCJ9IiwiIiwiIiwiY2hfY29scyA8LSBicmV3ZXIucGFsKDQsICdQYWlyZWQnKVtyZXAoMToyLDEzKV0iLCJEYXRhbGw8LWRhdGEudGFibGUoKSIsImZvciAocCBpbiAxOiBucm93KGNvbWIpKXsiLCIgICAgIyBtYXggdGhldGEgcGVyIGdlbm9tZSBmcm9tIHJlc2h1ZmZsaW5nIChhbGwgc2l0ZXMpIGZyb20gYW5nc2RfdGhldGFfc2l0ZXNodWZmbGVfbnVsbC5yIiwiICAgIG51bGw8LWZyZWFkKHBhc3RlMCgnLi4vRGF0YS9zaHVmZmxlL3RoZXRhLnNpdGVzaHVmZmxlLjUwMDAwLicsIGNvbWJbcCwxXSxcIl9cIixjb21iW3AsMl0sJy5jc3YuZ3onKSkiLCIgICAgIiwiICAgICN1cHBlciBhbmQgbG93ZXIgOTUlIiwiICAgIG51bGxbLCAuKHRXZF9sOTUgPSBxdWFudGlsZShtaW50V2QsIDAuMDUpLCB0V2RfdTk1ID0gcXVhbnRpbGUobWF4dFdkLCBwcm9icyA9IDAuOTUpLCIsIiAgICAgICAgICAgICAgICB0UGRfbDk1ID0gcXVhbnRpbGUobWludFBkLCAwLjA1KSwgdFBkX3U5NSA9IHF1YW50aWxlKG1heHRQZCwgcHJvYnMgPSAwLjk1KSwiLCIgICAgICAgICAgICAgICAgdERkX2w5NSA9IHF1YW50aWxlKG1pbnREZCwgMC4wNSksIHREZF91OTUgPSBxdWFudGlsZShtYXh0RGQsIHByb2JzID0gMC45NSkpXSIsIiIsIiAgICAjYXNzaWduKHBhc3RlMChcIm51bGwuXCIsY29tYltwLDFdLFwiX1wiLGNvbWJbcCwyXSksIG51bGwpICAgICIsIiAgICAiLCIgICAgIyBzbGlkaW5nIHdpbmRvd3MgdGhldGEgY2hhbmdlIChHQVRLIHNpdGVzKSBmcm9tIGFuZ3NkX3RoZXRhX3NpdGVzaHVmZmxlX251bGwuciIsIiAgICBkYXQ8LWZyZWFkKHBhc3RlMCgnLi4vRGF0YS9zaHVmZmxlL3RoZXRhX2NoYW5nZV9yZWdpb25fNTAwMDAuJywgY29tYltwLDFdLFwiX1wiLGNvbWJbcCwyXSwnLmNzdi5neicpLCBkcm9wID0gMSkiLCIgICAgZGF0Wyxwb3A6PXBhc3RlMChjb21iW3AsMV0sXCJfXCIsY29tYltwLDJdKV0iLCIgICAgIiwiICAgIGRhdDwtbWVyZ2UoZGF0LCBjaHJtYXhbLGMoXCJjaHJcIixcInN0YXJ0XCIpXSwgYnkueD1cIkNocm9tb1wiLCBieS55ID0gXCJjaHJcIikiLCIgICAgZGF0WywgUE9TZ2VuIDo9IFdpbkNlbnRlciArIHN0YXJ0XSIsIiAgICBkYXRbLHN0YXJ0IDo9IE5VTExdICNyZW1vdmUgc3RhcnQiLCIgICAgIiwiICAgICNjYWxjdWxhdGUgcC12YWx1ZXMiLCIgICAgIzEuIHRoZXRhVyBsb2NpIiwiICAgIGRhdFt0V2QgPiAwLCB0V2QucCA6PSBjYWxjcEcodFdkLCBudWxsJG1heHRXZCksIGJ5ID0gLihDaHJvbW8sIFdpbkNlbnRlcildICMgdGhldGFXICBsb2NpIiwiICAgIGRhdFt0V2QgPD0gMCwgdFdkLnAgOj0gY2FsY3BMKHRXZCwgbnVsbCRtaW50V2QpLCBieSA9IC4oQ2hyb21vLCBXaW5DZW50ZXIpXSIsIiAgICAjMi4gdGhldGEgcGkiLCIgICAgZGF0W3RQZCA+IDAsIHRQZC5wIDo9IGNhbGNwRyh0UGQsIG51bGwkbWF4dFBkKSwgYnkgPSAuKENocm9tbywgV2luQ2VudGVyKV0gIyB0aGV0YSBwaSIsIiAgICBkYXRbdFBkIDw9IDAsIHRQZC5wIDo9IGNhbGNwTCh0UGQsIG51bGwkbWludFBkKSwgYnkgPSAuKENocm9tbywgV2luQ2VudGVyKV0iLCIgICAgIiwiICAgICNUYWppbWEncyBEIiwiICAgIGRhdFt0RGQgPiAwLCB0RGQucCA6PSBjYWxjcEcodERkLCBudWxsJG1heHREZCksIGJ5ID0gLihDaHJvbW8sIFdpbkNlbnRlcildICMgdGFqaW1hJ3MgRCIsIiAgICBkYXRbdERkIDw9IDAsIHREZC5wIDo9IGNhbGNwTCh0RGQsIG51bGwkbWludERkKSwgYnkgPSAuKENocm9tbywgV2luQ2VudGVyKV0iLCIiLCIgICAgd3JpdGUuY3N2KGRhdCwgZmlsZT1nemZpbGUocGFzdGUwKCcuLi9PdXRwdXQvUGkvU2h1ZmZsZS90aGV0YV9zaXRlc2h1ZmZsZV8nLCBjb21iW3AsMV0sXCJfXCIsY29tYltwLDJdLCcuY3N2Lmd6JykpKSIsIiAgICAiLCIgICAgRGF0YWxsPC1yYmluZChEYXRhbGwsIGRhdCkiLCJ9IiwiIiwid3JpdGUuY3N2KERhdGFsbCwgZmlsZT1nemZpbGUocGFzdGUwKCcuLi9PdXRwdXQvUGkvU2h1ZmZsZS90aGV0YV9zaXRlc2h1ZmZsZV9QV1Nfc3VtbWFyeS5jc3YuZ3onKSkpIl19 -->
```r
#from codEvol angsd_theta_siteshuffle_null_stats.R
###########################
# load functions
###########################
require(data.table)
require(plyr)
require(ggplot2)
require(RColorBrewer)
#####################
# read in and prep data
#####################
years<-c("PWS91","PWS96","PWS07","PWS17")
comb<-t(combn(years, 2))
chrmax <- fread('../Data/new_vcf/chr_sizes.bed')
chrmax<-chrmax[,-2]
colnames(chrmax)<-c("chr", "len")
chrmax$start<-c(0,cumsum(chrmax$len)[1:(nrow(chrmax)-1)])
chrmax$end<-cumsum(chrmax$len)
chrmax$middle<-(chrmax$end-chrmax$start)/2+chrmax$start
#setkey(chrmax, chr)
#Functions to calculate p-values from codEvol
calcpG <- function(thetachange, null){ # for increases in theta
return((sum(null > thetachange)+1)/(length(null)+1)) # equation from North et al. 2002 Am J Hum Gen
}
calcpL <- function(thetachange, null){ # for decreases in theta
return((sum(null < thetachange)+1)/(length(null)+1)) # equation from North et al. 2002 Am J Hum Gen
}
ch_cols <- brewer.pal(4, 'Paired')[rep(1:2,13)]
Datall<-data.table()
for (p in 1: nrow(comb)){
# max theta per genome from reshuffling (all sites) from angsd_theta_siteshuffle_null.r
null<-fread(paste0('../Data/shuffle/theta.siteshuffle.50000.', comb[p,1],"_",comb[p,2],'.csv.gz'))
#upper and lower 95%
null[, .(tWd_l95 = quantile(mintWd, 0.05), tWd_u95 = quantile(maxtWd, probs = 0.95),
tPd_l95 = quantile(mintPd, 0.05), tPd_u95 = quantile(maxtPd, probs = 0.95),
tDd_l95 = quantile(mintDd, 0.05), tDd_u95 = quantile(maxtDd, probs = 0.95))]
#assign(paste0("null.",comb[p,1],"_",comb[p,2]), null)
# sliding windows theta change (GATK sites) from angsd_theta_siteshuffle_null.r
dat<-fread(paste0('../Data/shuffle/theta_change_region_50000.', comb[p,1],"_",comb[p,2],'.csv.gz'), drop = 1)
dat[,pop:=paste0(comb[p,1],"_",comb[p,2])]
dat<-merge(dat, chrmax[,c("chr","start")], by.x="Chromo", by.y = "chr")
dat[, POSgen := WinCenter + start]
dat[,start := NULL] #remove start
#calculate p-values
#1. thetaW loci
dat[tWd > 0, tWd.p := calcpG(tWd, null$maxtWd), by = .(Chromo, WinCenter)] # thetaW loci
dat[tWd <= 0, tWd.p := calcpL(tWd, null$mintWd), by = .(Chromo, WinCenter)]
#2. theta pi
dat[tPd > 0, tPd.p := calcpG(tPd, null$maxtPd), by = .(Chromo, WinCenter)] # theta pi
dat[tPd <= 0, tPd.p := calcpL(tPd, null$mintPd), by = .(Chromo, WinCenter)]
#Tajima's D
dat[tDd > 0, tDd.p := calcpG(tDd, null$maxtDd), by = .(Chromo, WinCenter)] # tajima's D
dat[tDd <= 0, tDd.p := calcpL(tDd, null$mintDd), by = .(Chromo, WinCenter)]
write.csv(dat, file=gzfile(paste0('../Output/Pi/Shuffle/theta_siteshuffle_', comb[p,1],"_",comb[p,2],'.csv.gz')))
Datall<-rbind(Datall, dat)
}
write.csv(Datall, file=gzfile(paste0('../Output/Pi/Shuffle/theta_siteshuffle_PWS_summary.csv.gz')))
## Plot the results ##
Datall<-fread('../Output/Pi/Shuffle/theta_siteshuffle_PWS_summary.csv.gz')
winsz = 5e4
#Changes in Pi between years
Datall$Chromo<-factor(Datall$Chromo, levels=c(paste0("chr",1:26)))
Datall$pop<-factor(Datall$pop, levels=c("PWS91_PWS96","PWS91_PWS07","PWS91_PWS17","PWS96_PWS07","PWS96_PWS17","PWS07_PWS17"))
ch_cols <- brewer.pal(4, 'Paired')[rep(1:2,13)]
ggplot(Datall, aes(POSgen, tPd/winsz, color = Chromo)) +
geom_point(size = 0.5, alpha = 0.3) +
facet_wrap(~pop, ncol = 1) +
scale_color_manual(values = ch_cols, guide="none") +
ylab('Change in pi per site')+xlab("Chromosome")+
ggtitle("Changes in Pi")+
scale_x_continuous(breaks=chrmax$middle, labels=1:26)+
theme_bw()
ggsave(paste0('../Output/Pi/Shuffle/Changes_in_Pi_PWS.png'), width = 7.5, height = 9, dpi = 300)
# plot theta_Watterson change
ggplot(Datall, aes(POSgen, tWd/winsz, color = Chromo)) +
geom_point(size = 0.5, alpha = 0.3) +
scale_color_manual(values = ch_cols, guide="none") +
facet_wrap(~pop, ncol = 1) +
ylab('Change in Wattersons theta per site')+xlab("Chromosome")+
ggtitle("Changes in Pi")+
scale_x_continuous(breaks=chrmax$middle, labels=1:26)+
theme_bw()
ggsave('../Output/Pi/Shuffle/Changes_in_thetaW_PWS.png', width = 7.5, height = 9, dpi = 300)
# plot Tajima's D change
ggplot(Datall, aes(POSgen, tDd, color = Chromo)) +
geom_point(size = 0.5, alpha = 0.3) +
scale_color_manual(values = ch_cols,guide="none") +
facet_wrap(~pop, ncol = 1) +
ylab('Change in Tajimas D per window')+xlab("Chromosome")+
ggtitle("Changes in Tajima's D")+
scale_x_continuous(breaks=chrmax$middle, labels=1:26)+
theme_bw()
ggsave('../Output/Pi/Shuffle/Changes_in_TajimasD_PWS.png', width = 7.5, height = 9, dpi = 300)# plot pi p-value vs. position
ch_cols <- brewer.pal(4, 'Paired')[rep(1:2,13)]
Datall$Chromo<-factor(Datall$Chromo, levels=c(paste0("chr",1:26)))
Datall$pop<-factor(Datall$pop, levels=c("PWS91_PWS96","PWS91_PWS07","PWS91_PWS17","PWS96_PWS07","PWS96_PWS17","PWS07_PWS17"))
ggplot(Datall, aes(POSgen, -log10(tPd.p)*sign(tPd), color = Chromo)) +
geom_point(size = 0.4, alpha = 0.3) +
facet_wrap(~pop, ncol = 1) +
scale_color_manual(values = ch_cols, guide="none") +xlab("Chromosome")+
ylab("log10(P-value)")+
geom_hline(yintercept = log10(0.05), linetype = 'dashed', color = 'red',size=0.3) +
geom_hline(yintercept = -log10(0.05), linetype = 'dashed', color = 'red', size=0.3)+
ggtitle("P-values for changes in Pi")+
scale_x_continuous(breaks=chrmax$middle, labels=1:26)+
theme_bw()
ggsave('../Output/Pi/Shuffle/Changes_in_Pi.siteshuffle.p-values_PWS.png', width = 7.5, height =11, units = 'in', dpi = 300)
# plot thetaW p-value vs. position (all loci)
ggplot(Datall, aes(POSgen, -log10(tWd.p)*sign(tWd), color = Chromo)) +
geom_point(size = 0.4, alpha = 0.3) +
facet_wrap(~pop, ncol = 1) +
scale_color_manual(values = ch_cols, guide="none") +xlab("Chromosome")+
ylab("log10(P-value)")+
geom_hline(yintercept = log10(0.05), linetype = 'dashed', color = 'red', size=0.3) +
geom_hline(yintercept = -log10(0.05), linetype = 'dashed', color = 'red', size=0.3)+
ggtitle("P-values for changes in Theta")+
scale_x_continuous(breaks=chrmax$middle, labels=1:26)+
theme_bw()
ggsave('../Output/Pi/Shuffle/Changes_in_thetaW.siteshuffle.p-values_PWS.png', width = 7.5, height =11, units = 'in', dpi = 300)
#plot Tajama's D p-value vs. position
ggplot(Datall, aes(POSgen, -log10(tDd.p)*sign(tDd), color = Chromo)) +
geom_point(size = 0.4, alpha = 0.3) +
facet_wrap(~pop, ncol = 1) +
scale_color_manual(values = ch_cols, guide="none") +xlab("Chromosome")+
ylab("log10(P-value)")+
geom_hline(yintercept = log10(0.05), linetype = 'dashed', color = 'red', size=0.3) +
geom_hline(yintercept = -log10(0.05), linetype = 'dashed', color = 'red', size=0.3)+
ggtitle("P-values for changes in Tajima's D")+
scale_x_continuous(breaks=chrmax$middle, labels=1:26)+
theme_bw()
ggsave('../Output/Pi/Shuffle/Changes_in_TajimaD.siteshuffle.p-values_PWS.png', width = 7.5, height =11, units = 'in', dpi = 300)# Outliers
Pi_outliers<-Datall[tPd.p < 0.05,]
Theta_outliers<-Datall[tWd.p < 0.05,]
TajimaD_outliers<-Datall[tDd.p < 0.05,] #no outliers
# Summary per chromosome per population
pi<-data.frame(table(Pi_outliers$pop, Pi_outliers$Chromo))
the<-data.frame(table(Theta_outliers$pop, Theta_outliers$Chromo))
D<-data.frame(table(TajimaD_outliers$pop, TajimaD_outliers$Chromo))
# Summary per population
pi2<-data.frame(table(Pi_outliers$pop))
the2<-data.frame(table(Theta_outliers$pop))
Ds2<-data.frame(table(TajimaD_outliers$pop))
#plot PWS91-96, 96-07, and 07-17
yrs<-c("PWS91_PWS96","PWS96_PWS07","PWS07_PWS17")
col3<-brewer.pal(5,"PuRd")[c(2,3,5)]
div1<-diverging_hcl(6, palette="Blue-Red")
#reverse the color
div2<-rev(div1)
pis<-pi[pi$Var1 %in% yrs,]
pis$Var1<-factor(pis$Var1, levels=yrs)
ggplot(pis, aes(x=Var2, y=Freq, fill=Var1))+
geom_bar(stat="identity",position=position_dodge(width=0.8))+
scale_fill_manual(values=col3)+
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1), legend.title = element_blank())+
ggtitle(expression(paste("Changes in ", pi)))+
xlab('')+ylab('Number of regions with P<0.05')
ggsave("../Output/Pi/Shuffle/Pi_significant_perChrom_perPop.png", width = 8, height = 4, dpi=300)
# Per population comparison
ggplot(pi2, aes(x=Var1, y=Freq, fill=Var1))+
geom_bar(stat="identity",position=position_dodge(width=0.8))+
scale_fill_manual(values=div1)+
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1), legend.title = element_blank())+
ggtitle(expression(paste("Changes in ", pi)))+
xlab('')+ylab('Number of regions with P<0.05')
ggsave("../Output/Pi/Shuffle/Pi_significant_perComparison.png", width = 5, height = 4, dpi=300)
#ordered
pi3<-pi2[order(pi2$Freq, decreasing = T),]
pi3$Var1<-factor(pi3$Var1, levels=paste0(unique(pi3$Var1)))
ggplot(pi3, aes(x=Var1, y=Freq, fill=Var1))+
geom_bar(stat="identity",position=position_dodge(width=0.8))+
scale_fill_manual(values=div2)+
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1), legend.title = element_blank())+
ggtitle(expression(paste("Changes in ", pi, " (shuffle)")))+
xlab('')+ylab('Number of regions with P<0.05')
ggsave("../Output/Pi/Shuffle/Pi_significant_perComparison_ordered.png", width = 5, height = 4, dpi=300)
#Assign colors to the comparison group
names(div2)<- unique(pi3$Var1)#Thetea
ths<-the[the$Var1 %in% yrs,]
ths$Var1<-factor(ths$Var1, levels=yrs)
ggplot(ths, aes(x=Var2, y=Freq, fill=Var1))+
geom_bar(stat="identity",position=position_dodge(width=0.8))+
scale_fill_manual(values=col3)+
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1), legend.title = element_blank())+
ggtitle(paste0("Changes in theta"))+
xlab('')+ylab('Number of regions with P<0.05')
ggsave("../Output/Pi/Shuffle/Theta_significant_perChrom_perPop.png", width = 5, height = 3, dpi=300)
ggplot(the2, aes(x=Var1, y=Freq, fill=Var1))+
geom_bar(stat="identity",position=position_dodge(width=0.8))+
scale_fill_manual(values=div1)+
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1), legend.title = element_blank())+
ggtitle(paste("Changes in theta"))+
xlab('')+ylab('Number of regions with P<0.05')
ggsave("../Output/Pi/Shuffle/Theta_significant_perComparison.png", width = 5, height = 4, dpi=300)
the3<-the2[order(the2$Freq, decreasing = T),]
the3$Var1<-factor(the3$Var1, levels=paste0(unique(the3$Var1)))
ggplot(the3, aes(x=Var1, y=Freq, fill=Var1))+
geom_bar(stat="identity",position=position_dodge(width=0.8))+
scale_fill_manual(name = "Var1",values = div2)+
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1), legend.title = element_blank())+
ggtitle(paste("Changes in theta"))+
xlab('')+ylab('Number of regions with P<0.05')
ggsave("../Output/Pi/Shuffle/Theta_significant_perComparison_ordered.png", width = 5, height = 4, dpi=300) # Tajima's D
ggplot(Ds2, aes(x=Var1, y=Freq, fill=Var1))+
geom_bar(stat="identity",position=position_dodge(width=0.8))+
scale_fill_manual(values=div1)+
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1), legend.title = element_blank())+
ggtitle(paste0("Changes in Tajima's D"))+
xlab('')+ylab('Number of regions with P<0.05')
ggsave("../Output/Pi/Shuffle/TajimaD_significant_perComparison.png", width = 5, height = 4, dpi=300)
D2<-D[D$Var1 %in% yrs,]
D2$Var1<-factor(D2$Var1, levels=yrs)
ggplot(D2, aes(x=Var2, y=Freq, fill=Var1))+
geom_bar(stat="identity",position=position_dodge(width=0.8))+
scale_fill_manual(values=col3)+
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1), legend.title = element_blank())+
ggtitle(paste0("Changes in Tajima's D"))+
xlab('')+ylab('Number of regions with P>0.05')
#ggsave("../Output/Pi/Shuffle/TajimaD_significant_perChrom_perPop.png", width = 8, height = 4, dpi=300)
```bash
#run FstPWSprint.sh
/home/jamcgirr/apps/angsd/misc/realSFS fst print /home/ktist/ph/data/angsd/analysis/fst_PWS91_PWS07_persite_maf00.fst.idx > /home/ktist/ph/data/angsd/analysis/fst_PWS91_PWS07_persite_maf00.txt
bgzip /home/ktist/ph/data/angsd/analysis/fst_PWS91_PWS07_persite_maf00.txt
<!-- rnb-source-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
2. Run Fst_shuffle_pws.R at Farm (slurm script: angsd_fst_siteshuffle_null.sh)
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjpbIiMgRnJvbSBodHRwczovL2dpdGh1Yi5jb20vcGluc2t5bGFiL2NvZEV2b2wgYW5nc2RfZnN0X3NpdGVzaHVmZmxlX251bGwuciIsIiIsIiMgc2h1ZmZsZSBBTkdTRCBwZXJzaXRlIEZTVCBBIGFuZCBCIHZhbHVlcyBhY3Jvc3Mgc2l0ZXMgYW5kIGNhbGN1bGF0ZSB3aW5kb3dlZCBGU1QgdG8gZ2V0IGEgbnVsbCBkaXN0cmlidXRpb24gb2YgbWF4IGdlbm9tZS13aWRlIEZTVCIsIiMgbmVlZCBmc3QgcGVyc2l0ZSBvdXRwdXQgZnJvbSBhbmdzZCBmc3RfKl9wZXJzaXRlX21hZjAwLnR4dC5neiBmaWxlcyIsIiIsIiMgdG8gcnVuIG9uIGZhcm0iLCIiLCIiLCIjIyMjIyBSIGNvZGUgICMjIyMjIyMgIiwiIiwiIyBwYXJhbWV0ZXJzIiwid2luc3ogPC0gNTAwMDAgIyB3aW5kb3cgc2l6ZSIsIndpbnN0cCA8LSAxMDAwMCAjIHdpbmRvdyBzdGVwIiwibnJlcCA8LSAxMDAwICMgbnVtYmVyIG9mIHJlc2h1ZmZsZXMiLCJtaW5sb2NpIDwtIDEwICMgbWluaW11bSBudW1iZXIgb2YgbG9jaSBwZXIgd2luZG93IHRvIGNvbnNpZGVyIiwiIiwiIiwiIyBsb2FkIGZ1bmN0aW9ucyIsInJlcXVpcmUoZGF0YS50YWJsZSkiLCIiLCIiLCIjIyMjIyMjIyMjIyMjIiwiIyBQcmVwIGRhdGEiLCIjIyMjIyMjIyMjIyMjIiwiIyBsb2FkIGZzdCBBL0IgZGF0YSIsIiNjYW4gPC0gZnJlYWQoJ2FuYWx5c2lzL0Nhbl80MC5DYW5fMTQuZnN0LkFCLmd6JykiLCIjc2V0bmFtZXMoY2FuLCBjKCdDSFJPTScsICdQT1MnLCAnQScsICdCJykpIiwiIiwicHdzOTE5NiA8LSBmcmVhZCgnL2hvbWUva3Rpc3QvcGgvZGF0YS9hbmdzZC9hbmFseXNpcy9mc3RfUFdTOTFfUFdTOTZfcGVyc2l0ZV9tYWYwMC50eHQuZ3onKSIsInNldG5hbWVzKHB3czkxOTYsIGMoJ0NIUk9NJywgJ1BPUycsICdBJywgJ0InKSkiLCJwd3M5MTA3IDwtIGZyZWFkKCcvaG9tZS9rdGlzdC9waC9kYXRhL2FuZ3NkL2FuYWx5c2lzL2ZzdF9QV1M5MV9QV1MwN19wZXJzaXRlX21hZjAwLnR4dC5neicpIiwic2V0bmFtZXMocHdzOTEwNywgYygnQ0hST00nLCAnUE9TJywgJ0EnLCAnQicpKSIsInB3czkxMTcgPC0gZnJlYWQoJy9ob21lL2t0aXN0L3BoL2RhdGEvYW5nc2QvYW5hbHlzaXMvZnN0X1BXUzkxX1BXUzE3X3BlcnNpdGVfbWFmMDAudHh0Lmd6JykiLCJzZXRuYW1lcyhwd3M5MTE3LCBjKCdDSFJPTScsICdQT1MnLCAnQScsICdCJykpIiwicHdzOTYwNyA8LSBmcmVhZCgnL2hvbWUva3Rpc3QvcGgvZGF0YS9hbmdzZC9hbmFseXNpcy9mc3RfUFdTOTZfUFdTMDdfcGVyc2l0ZV9tYWYwMC50eHQuZ3onKSIsInNldG5hbWVzKHB3czk2MDcsIGMoJ0NIUk9NJywgJ1BPUycsICdBJywgJ0InKSkiLCJwd3M5NjE3IDwtIGZyZWFkKCcvaG9tZS9rdGlzdC9waC9kYXRhL2FuZ3NkL2FuYWx5c2lzL2ZzdF9QV1M5Nl9QV1MxN19wZXJzaXRlX21hZjAwLnR4dC5neicpIiwic2V0bmFtZXMocHdzOTYxNywgYygnQ0hST00nLCAnUE9TJywgJ0EnLCAnQicpKSIsInB3czA3MTcgPC0gZnJlYWQoJy9ob21lL2t0aXN0L3BoL2RhdGEvYW5nc2QvYW5hbHlzaXMvZnN0X1BXUzA3X1BXUzE3X3BlcnNpdGVfbWFmMDAudHh0Lmd6JykiLCJzZXRuYW1lcyhwd3MwNzE3LCBjKCdDSFJPTScsICdQT1MnLCAnQScsICdCJykpIiwiIiwiUFdTPC1saXN0KCkiLCJQV1NbWzFdXTwtcHdzOTE5NiIsIlBXU1tbMl1dPC1wd3M5MTA3IiwiUFdTW1szXV08LXB3czkxMTciLCJQV1NbWzRdXTwtcHdzOTYwNyIsIlBXU1tbNV1dPC1wd3M5NjE3ICIsIlBXU1tbNl1dPC1wd3MwNzE3ICIsIiAgICAiLCIgIiwiIyBjcmVhdGUgbmV3IGNvbHVtbnMgYXMgaW5kaWNlcyBmb3Igd2luZG93cyIsIiIsImZvciAoaSBpbiAxOiBsZW5ndGgoUFdTKSl7IiwiICAgIGRmPC1QV1NbW2ldXSIsIiAgICBmb3IoaiBpbiAxOih3aW5zei93aW5zdHApKXsiLCIgICAgICAgIGRmWywgKHBhc3RlMCgnd2luJywgaikpIDo9IGZsb29yKChQT1MgLSAoai0xKSp3aW5zdHApL3dpbnN6KSp3aW5zeiArIHdpbnN6LzIgKyAoai0xKSp3aW5zdHBdIiwiICAgIH0iLCIgICAgUFdTW1tpXV08LWRmIiwifSIsIiIsIiMgbWFyayB3aW5kb3dzIHdpdGggPCBtaW5sb2NpIGZvciByZW1vdmFsIiwicmVtIDwtIHJlcCgwLCA2KSAjIG51bWJlciBvZiB3aW5kb3dzIHJlbW92ZWQgZm9yIGVhY2ggb2YgdGhlIDYgY29tcGFyaXNvbnMiLCIiLCJmb3IoaSBpbiAxOiBsZW5ndGgoUFdTKSl7IiwiICAgIHB3PC1QV1NbW2ldXSIsIiAgICBmb3IoaiBpbiAxOih3aW5zei93aW5zdHApKXsiLCIgICAgICAgIHB3d2luIDwtIHB3WywgLihuc25wcyA9IGxlbmd0aChQT1MpKSwgYnkgPSAuKHdpbiA9IGdldChwYXN0ZTAoJ3dpbicsIGopKSldICMgY2FsYyBudW0gc25wcyBwZXIgd2luZG93IiwiICAgICAgICByZW1bMV0gPC0gcmVtWzFdICsgcHd3aW5bLCBzdW0obnNucHMgPCBtaW5sb2NpKV0gIyByZWNvcmQgbnVtYmVyIHRvIGJlIHJlbW92ZWQiLCIgICAgICAgIHB3d2luWywgKHBhc3RlMCgnd2luJywgaiwgJ2tlZXAnKSkgOj0gMV0gIyBjcmVhdGUgY29sIHRvIG1hcmsgd2hpY2ggd2luZG93cyB0byBrZWVwIiwiICAgICAgICBwd3dpbltuc25wcyA8IG1pbmxvY2ksIChwYXN0ZTAoJ3dpbicsIGosICdrZWVwJykpIDo9IDBdICMgbWFyayB3aW5kb3dzIHRvIHJlbW92ZSIsIiAgICAgICAgcHd3aW5bLCBuc25wcyA6PSBOVUxMXSAjIGRyb3AgY29sdW1uIiwiICAgICAgICBzZXRuYW1lcyhwd3dpbiwgXCJ3aW5cIiwgcGFzdGUwKCd3aW4nLCBqKSkgIyBjaGFuZ2UgY29sIG5hbWUiLCIgICAgICAgIHB3IDwtIG1lcmdlKHB3LCBwd3dpbiwgYnkgPSBwYXN0ZTAoJ3dpbicsIGopLCBhbGwueCA9IFRSVUUpICMgbWVyZ2Uga2VlcGVyIGNvbCBiYWNrIHRvIGZ1bGwgZGF0YXNldCIsIiAgICB9IiwiICAgIFBXU1tbaV1dPC1wdyIsIn0iLCIiLCJyZW0gIyBudW1iZXIgb2Ygd2luZG93cyByZW1vdmVkIGZvciBlYWNoIGNvbXBhcmlzb24iLCIiLCIiLCIiLCIiLCIjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMiLCIjIHNodWZmbGUgYW5kIHJlY2FsYyB3aW5kb3dlZCBGU1QiLCIjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMiLCJjb2xubXMgPC0gYygnQ0hST00nLCAnUE9TJywgcGFzdGUwKCd3aW4nLCAxOih3aW5zei93aW5zdHApKSwgcGFzdGUwKCd3aW4nLCAxOih3aW5zei93aW5zdHApLCAna2VlcCcpKSAjIGxpc3Qgb2YgY29sdW1uIG5hbWVzIHdlIHdhbnQgb3V0IG9mIHRoZSBiYXNlIGRhdGEudGFibGUiLCIiLCIjIFBXUzA3LTE3IiwicG9wczwtYyhcIlBXUzkxLjk2XCIsXCJQV1M5MS4wN1wiLFwiUFdTOTEuMTdcIixcIlBXUzk2LjA3XCIsXCJQV1M5Ni4xN1wiLFwiUFdTMDcuMTdcIikiLCIiLCIiLCJmb3IocCBpbiAxOmxlbmd0aChQV1MpKXsiLCIgICAgcHJpbnQocGFzdGUwKCdTdGFydGluZyAnLCBwb3BzW3BdKSkiLCIiLCIgICAgcHc8LVBXU1tbcF1dIiwiICAgIGZvcihpIGluIDE6bnJlcCl7IiwiICAgICAgICBjYXQoaSk7IGNhdCgnICcpIiwiICAgICAgICAjIGNyZWF0ZSBuZXcgZGF0YXNldCIsIiAgICAgICAgaW5kcyA8LSBzYW1wbGUoMTpucm93KHB3KSwgbnJvdyhwdyksIHJlcGxhY2UgPSBGQUxTRSkiLCIgICAgICAgIHRlbXAgPC0gY2JpbmQocHdbLCAuLmNvbG5tc10sIHB3W2luZHMsIC4oQSwgQildKSAjIHNodWZmbGUgRlNUcyBhY3Jvc3MgcG9zaXRpb25zIiwiICAgICAgICAiLCIgICAgICAgICMgY2FsYyBmc3QgZm9yIGVhY2ggd2luZG93IHRvIGtlZXAiLCIgICAgICAgIGZvcihqIGluIDE6KHdpbnN6L3dpbnN0cCkpeyIsIiAgICAgICAgICAgIHRlbXAyIDwtIHRlbXBbZ2V0KHBhc3RlMCgnd2luJywgaiwgJ2tlZXAnKSkgPT0gMSwgXSAjIHRyaW0gdG8gd2luZG93cyB0byBrZWVwLiBjYW4ndCBjb21iaW5lIHdpdGggbmV4dCBsaW5lIGZvciBzb21lIHJlYXNvbi4iLCIgICAgICAgICAgICBpZihqID09MSkgdGVtcGZzdHMgPC0gdGVtcDJbLCAuKGZzdCA9IHN1bShBKS9zdW0oQikpLCBieSA9IC4oQ0hST00sIFBPUyA9IGdldChwYXN0ZTAoJ3dpbicsIGopKSldIiwiICAgICAgICAgICAgaWYoaiA+IDEpIHRlbXBmc3RzIDwtIHJiaW5kKHRlbXBmc3RzLCB0ZW1wMlssIC4oZnN0ID0gc3VtKEEpL3N1bShCKSksIGJ5ID0gLihDSFJPTSwgUE9TID0gZ2V0KHBhc3RlMCgnd2luJywgaikpKV0pIiwiICAgICAgICB9IiwiICAgICAgICAiLCIgICAgICAgICMgc2F2ZSB0aGUgbWF4IHdpbmRvd2VkIGZzdCIsIiAgICAgICAgIyBleGNsdWRlIHdpbmRvd3Mgd2l0aCBuZWdhdGl2ZSBtaWRwb2ludHMiLCIgICAgICAgIGlmKGkgPT0gMSkgbWF4ZnN0IDwtIHRlbXBmc3RzW1BPUyA+IDAsIG1heChmc3QsIG5hLnJtID0gVFJVRSldXHQiLCIgICAgICAgIGlmKGkgPiAxKSBtYXhmc3QgPC0gYyhtYXhmc3QsIHRlbXBmc3RzW1BPUyA+IDAsIG1heChmc3QsIG5hLnJtID0gVFJVRSldKSIsIiAgICB9IiwiIiwiICAgIHByaW50KHBhc3RlKCdNYXg6JywgbWF4KG1heGZzdCwgbmEucm0gPSBUUlVFKSwgJzsgOTV0aDonLCBxdWFudGlsZShtYXhmc3QsIHByb2IgPSAwLjk1LCBuYS5ybSA9IFRSVUUpKSkiLCIgICAgIiwiICAgIHdyaXRlLmNzdihtYXhmc3QsIGd6ZmlsZShwYXN0ZTAoJy9ob21lL2t0aXN0L3BoL2RhdGEvYW5nc2QvYW5hbHlzaXMvJyxwb3BzW3BdLCAnX3NpdGVzaHV1ZmZsZS5jc3YuZ3onKSksIHJvdy5uYW1lcyA9IEZBTFNFKSIsIiAgICBybShtYXhmc3QpIiwifSJdfQ== -->
```r
# From https://github.com/pinskylab/codEvol angsd_fst_siteshuffle_null.r
# shuffle ANGSD persite FST A and B values across sites and calculate windowed FST to get a null distribution of max genome-wide FST
# need fst persite output from angsd fst_*_persite_maf00.txt.gz files
# to run on farm
##### R code #######
# parameters
winsz <- 50000 # window size
winstp <- 10000 # window step
nrep <- 1000 # number of reshuffles
minloci <- 10 # minimum number of loci per window to consider
# load functions
require(data.table)
#############
# Prep data
#############
# load fst A/B data
#can <- fread('analysis/Can_40.Can_14.fst.AB.gz')
#setnames(can, c('CHROM', 'POS', 'A', 'B'))
pws9196 <- fread('/home/ktist/ph/data/angsd/analysis/fst_PWS91_PWS96_persite_maf00.txt.gz')
setnames(pws9196, c('CHROM', 'POS', 'A', 'B'))
pws9107 <- fread('/home/ktist/ph/data/angsd/analysis/fst_PWS91_PWS07_persite_maf00.txt.gz')
setnames(pws9107, c('CHROM', 'POS', 'A', 'B'))
pws9117 <- fread('/home/ktist/ph/data/angsd/analysis/fst_PWS91_PWS17_persite_maf00.txt.gz')
setnames(pws9117, c('CHROM', 'POS', 'A', 'B'))
pws9607 <- fread('/home/ktist/ph/data/angsd/analysis/fst_PWS96_PWS07_persite_maf00.txt.gz')
setnames(pws9607, c('CHROM', 'POS', 'A', 'B'))
pws9617 <- fread('/home/ktist/ph/data/angsd/analysis/fst_PWS96_PWS17_persite_maf00.txt.gz')
setnames(pws9617, c('CHROM', 'POS', 'A', 'B'))
pws0717 <- fread('/home/ktist/ph/data/angsd/analysis/fst_PWS07_PWS17_persite_maf00.txt.gz')
setnames(pws0717, c('CHROM', 'POS', 'A', 'B'))
PWS<-list()
PWS[[1]]<-pws9196
PWS[[2]]<-pws9107
PWS[[3]]<-pws9117
PWS[[4]]<-pws9607
PWS[[5]]<-pws9617
PWS[[6]]<-pws0717
# create new columns as indices for windows
for (i in 1: length(PWS)){
df<-PWS[[i]]
for(j in 1:(winsz/winstp)){
df[, (paste0('win', j)) := floor((POS - (j-1)*winstp)/winsz)*winsz + winsz/2 + (j-1)*winstp]
}
PWS[[i]]<-df
}
# mark windows with < minloci for removal
rem <- rep(0, 6) # number of windows removed for each of the 6 comparisons
for(i in 1: length(PWS)){
pw<-PWS[[i]]
for(j in 1:(winsz/winstp)){
pwwin <- pw[, .(nsnps = length(POS)), by = .(win = get(paste0('win', j)))] # calc num snps per window
rem[1] <- rem[1] + pwwin[, sum(nsnps < minloci)] # record number to be removed
pwwin[, (paste0('win', j, 'keep')) := 1] # create col to mark which windows to keep
pwwin[nsnps < minloci, (paste0('win', j, 'keep')) := 0] # mark windows to remove
pwwin[, nsnps := NULL] # drop column
setnames(pwwin, "win", paste0('win', j)) # change col name
pw <- merge(pw, pwwin, by = paste0('win', j), all.x = TRUE) # merge keeper col back to full dataset
}
PWS[[i]]<-pw
}
rem # number of windows removed for each comparison
####################################
# shuffle and recalc windowed FST
####################################
colnms <- c('CHROM', 'POS', paste0('win', 1:(winsz/winstp)), paste0('win', 1:(winsz/winstp), 'keep')) # list of column names we want out of the base data.table
# PWS07-17
pops<-c("PWS91.96","PWS91.07","PWS91.17","PWS96.07","PWS96.17","PWS07.17")
for(p in 1:length(PWS)){
print(paste0('Starting ', pops[p]))
pw<-PWS[[p]]
for(i in 1:nrep){
cat(i); cat(' ')
# create new dataset
inds <- sample(1:nrow(pw), nrow(pw), replace = FALSE)
temp <- cbind(pw[, ..colnms], pw[inds, .(A, B)]) # shuffle FSTs across positions
# calc fst for each window to keep
for(j in 1:(winsz/winstp)){
temp2 <- temp[get(paste0('win', j, 'keep')) == 1, ] # trim to windows to keep. can't combine with next line for some reason.
if(j ==1) tempfsts <- temp2[, .(fst = sum(A)/sum(B)), by = .(CHROM, POS = get(paste0('win', j)))]
if(j > 1) tempfsts <- rbind(tempfsts, temp2[, .(fst = sum(A)/sum(B)), by = .(CHROM, POS = get(paste0('win', j)))])
}
# save the max windowed fst
# exclude windows with negative midpoints
if(i == 1) maxfst <- tempfsts[POS > 0, max(fst, na.rm = TRUE)]
if(i > 1) maxfst <- c(maxfst, tempfsts[POS > 0, max(fst, na.rm = TRUE)])
}
print(paste('Max:', max(maxfst, na.rm = TRUE), '; 95th:', quantile(maxfst, prob = 0.95, na.rm = TRUE)))
write.csv(maxfst, gzfile(paste0('/home/ktist/ph/data/angsd/analysis/',pops[p], '_siteshuuffle.csv.gz')), row.names = FALSE)
rm(maxfst)
}
Need - sitehuffle results fst_siteshuuffle.csv.gz - window-based Fst results from angsd _50kWindow_maf00 - persite Fst files after LD pruning (with AB info) *_persite_maf00_ldtrim.csv.gz
#from https://github.com/pinskylab/codEvol
# parameters
minloci <- 10
winsz <- 50000 # window size
winstp <- 10000 # window step
# load functions
require(data.table)
require(ggplot2)
calcp <- function(fst, null) return((sum(null > fst)+1)/(length(null)+1)) # equation from North et al. 2002 Am J Hum Gen
# read in and prep data
years<-c("PWS91","PWS96","PWS07","PWS17")
comb<-t(combn(years, 2))
# continuous nucleotide position for the whole genome
chrmax <- fread('../Data/new_vcf/chr_sizes.bed')
chrmax<-chrmax[,-2]
colnames(chrmax)<-c("chr", "len")
chrmax$start<-c(0,cumsum(chrmax$len)[1:(nrow(chrmax)-1)])
setkey(chrmax, chr)
prunedFst_gw<-data.frame(pop1=comb[,1],pop2=comb[,2])
#for (p in 1: nrow(comb)){
for (p in 1:5){
#genome-wide max Fst from reshuffling (unlinked sites)
null<-fread(paste0('Data/shuffle/', comb[p,1],".",comb[p,2],'_fst_siteshuuffle.csv.gz'))
#window-based Fst from angsd
fstwin <- fread(paste0('Data/new_vcf/angsd/fromVCF/2D/fst_',comb[p,1],"_",comb[p,2],'_50kWindow_maf00'), header = FALSE, col.names = c('region', 'chr', 'midPos', 'Nsites', 'fst')) # all sites
#persite fst (AB) -pruned sites
AB <- fread(paste0('Data/shuffle/fst_',comb[p,1],"_",comb[p,2],'_persite_maf00_ldtrim.csv.gz'))
# merge nucleotide position into the frequency files
setkey(AB, CHROM)
AB <- AB[chrmax[, .(CHROM = chr, start)], ]
AB[, posgen := POS + start]
AB[,start := NULL]
# Calc genome-wide FST
prunedFst_gw$gwFst[p]<-AB[!is.na(A), sum(A)/sum(B)]
# Calc windowed FST
# create new columns as indices for windows
for(j in 1:(winsz/winstp)){
AB[, (paste0('win', j)) := floor((POS - (j-1)*winstp)/winsz)*winsz + winsz/2 + (j-1)*winstp]
}
# calc fst and # snps per window
for(j in 1:(winsz/winstp)){
if(j ==1){
fstwin <- AB[, .(fst = sum(A)/sum(B), nloci = length(POS)), by = .(CHROM, midPos = get(paste0('win', j)))]
}
if(j > 1){
fstwin <- rbind(fstwin, AB[, .(fst = sum(A)/sum(B), nloci = length(POS)), by = .(CHROM, midPos = get(paste0('win', j)))])
}
}
## null model stats
fstats<-null[, .(max = max(x), u95 = quantile(x, probs = 0.95))]
prunedFst_gw$null.Fstmax[p]<-fstats$max[1]
prunedFst_gw$null.Fst.95q[p]<-fstats$u95[1]
#Calculate p-values
pval<-fstwin[, p := calcp(fst, null$x), by = .(CHROM, midPos)]
#combined the datasets
pair<-paste0(comb[p,1],"_",comb[p,2])
fstwin[, pop := pair ]
write.csv(fstwin, paste0("../Output/Fst/fst_siteshuffle.pairwise_", pair,".csv"))
}
write.csv(prunedFst_gw, "../Output/Fst/fst_siteshuffle_pws_summary.csv")
data<-data.frame()
for(i in 1: nrow(comb)){
pair<-paste0(comb[i,1],"_",comb[i,2])
df<-fread(paste0("../Output/Fst/fst_siteshuffle.pairwise_", pair,".csv"))
df<-df[!is.na(fst) & midPos > 0, ]
df[,ch:= as.integer(gsub("chr",'', CHROM))]
setkey(df,CHROM)
df<-df[chrmax[, .(CHROM = chr, start)], ]
df[,pos.gw:= midPos+start]
#setkey(df, ch, midPos)
data<-rbind(data, df)
}
write.csv(data, file = gzfile('../Output/Fst/fst_siteshuffle.angsd.PWS.csv.gz'), row.names = FALSE)data<-fread('../Output/Fst/fst_siteshuffle.angsd.PWS.csv.gz')
#pop as factor
data[, pop := factor(pop, levels = c("PWS91_PWS96","PWS91_PWS07","PWS91_PWS17","PWS96_PWS07","PWS96_PWS17","PWS07_PWS17"))]
# plot p-value vs. position
two_cols<-rep(c("steelblue","lightblue"), times=13)
minloci <- 10 # minimum number of loci per window to consider
ggplot(data[nloci >= minloci, ], aes(pos.gw, -log10(p), color = factor(ch))) +
geom_point(size = 0.2, alpha = 0.5) +
facet_wrap(~pop, ncol = 1) +
scale_color_manual(values = two_cols, guide='none') +
geom_hline(yintercept = -log10(0.05), linetype = 'dashed', color = 'grey')+
theme_bw()+
theme(axis.text.x = element_blank())+
xlab("Genome")
ggsave("../Output/Fst/fst.siteshuffle.p_vs_pos.PWS.png", width = 7.5, height = 7.5,dpi = 300)# plot p-value vs. nloci to check if any biases exist
ggplot(data, aes(nloci, -log10(p), color = pop)) +
geom_point(size = 0.6, alpha = 0.5) +
geom_hline(yintercept = -log10(0.05), linetype = 'dashed', color = 'grey') +
scale_x_log10()+theme_bw()+theme(legend.title = element_blank())+
guides(color = guide_legend(override.aes = list(size = 3, alpha=1)))
ggsave("../Output/Fst/fst.siteshuffle.p_vs_nloci.PWS.png", width = 8, height =4,dpi = 300)```r
outliers<-data[p < 0.05, ]
write.csv(outliers, \../Output/Fst/Fst_nullshuffling_outliers_PWS.csv\)
counts<-data.table(table(outliers$pop))
ggplot(counts, aes(x=V1, y=N))+
geom_bar(stat=\identity\, fill=cols[4])+
xlab('')+ylab('')+
theme_classic()+
theme(axis.text.x = element_text(angle=45, hjust=1))
ggsave(\../Output/Fst/Fst_outliers_byComparison.png\, width = 4, height = 3, dpi=300 )
counts$V1<-factor(counts$V1, levels=c(\PWS96_PWS07\,\PWS07_PWS17\,\PWS91_PWS96\, \PWS91_PWS07\, \PWS91_PWS17\,\PWS96_PWS17\))
ggplot(counts, aes(x=V1, y=N, fill=V1))+
geom_bar(stat=\identity\)+
scale_fill_manual(name = \V\,values = div2)+
xlab('')+ylab('Number of regions with P<0.05')+
theme_classic()+ggtitle(\Fst (shuffle)\)+
theme(axis.text.x = element_text(angle=45, hjust=1), legend.title = element_blank())
ggsave(\../Output/Fst/Fst_outliers_byComparison_ordered.png\, width = 4, height = 3, dpi=300 )
#outliers<-read.csv(\../Output/Fst/Fst_nullshuffling_outliers_PWS.csv\, row.names = 1)
knitr::kable(outliers[,c(2,3,4,5,6,7,8)], caption=\Outlier regions\)
#Outlier regions
#CHROM midPos fst nloci p pop ch
#chr25 825000 0.1210997 138 0.003996 PWS96_PWS07 25
#chr25 835000 0.0818802 279 0.020979 PWS96_PWS07 25
#chr25 845000 0.0721926 311 0.033966 PWS96_PWS07 25
#chr25 805000 0.1242652 166 0.002997 PWS96_PWS07 25
#chr25 815000 0.1377218 155 0.002997 PWS96_PWS07 25
#chr19 1375000 0.1251602 42 0.003996 PWS07_PWS17 19```
* NO overlapping regions from PCAngsd selection
scan